library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.1 ✓ dplyr 1.0.6
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(RColorBrewer)
library(stringr)
setwd("/Users/binitafebles/Desktop/Handley_Lab/Artic_renamed/combined_snpsift")
#load samples from all five plates separately
df1<-read.csv("feb_sample.tsv", sep = "")
df2<-read.csv("mar_sample.tsv", sep = "")
df3<-read.csv("apr_sample.tsv", sep = "")
df4<-read.csv("may_sample.tsv", sep = "")
df5<-read.csv("june_sample.tsv", sep = "")
df6<-read.csv("july_sample.tsv", sep = "")
# concatenate all dfs into one by rows (rbind)
combined_df<-rbind(df1,df2,df3,df4,df5,df6)
#remove rows without any snpeff annotations (most likely samples without any mutations)
combined_df1<-combined_df[!(combined_df$SAMPLE=="SAMPLE"),]
#rename certain column names
combined_df2<-rename(combined_df1,EFFECT=ANN.0..EFFECT,GENE=ANN.0..GENE, HGVS_P=ANN.0..HGVS_P)
combined_df2$POS<-as.numeric(combined_df2$POS)
#some of the gene position aren't correctly annotated. so created new column with Gene name based on gene position.
combined_df3<-combined_df2%>%
mutate(GENE_POS = case_when(
POS>=1 & POS<=265 ~ "5'UTR",
POS>=266 & POS<=21555 ~ 'ORF1ab',
POS>=21563 & POS<=25384 ~ 'S',
POS>=25393 & POS<=26220 ~ 'ORF3a',
POS>=26245 & POS<=26472 ~ 'E',
POS>=26523 & POS<=27191 ~ 'M',
POS>=27202 & POS<=27387 ~ 'ORF6',
POS>=27394 & POS<=27759 ~ 'ORF7a',
POS>=27756 & POS<=27887 ~ 'ORF7b',
POS>=27894 & POS<=28259 ~ 'ORF8',
POS>=28274 & POS<=29533 ~ 'N',
POS>=29558 & POS<=29674 ~ 'ORF10',
POS>=29675 & POS<=29903 ~ "3'UTR",
TRUE ~ GENE))
#add new col 'VARTYPE' to determine whether the variant is snp,insertion or deletion based on the length of characters in REF and ALT col.
combined_df4<-combined_df3%>%
mutate(VARTYPE = case_when(
nchar(ALT) == nchar(REF) ~ "SNP",
nchar(ALT) > nchar(REF) ~ "INS",
nchar(ALT) < nchar(REF) ~ "DEL"
))
combined_df5<-combined_df4%>%
mutate(AA_Change=
str_replace_all(HGVS_P,c(
'Ala'= 'A','Arg'='R','Asn'='N','Asp'='D','Cys'='C','Gln'='Q',
'Glu'='E','Gly'='G','His'='H','Ile'='I','Leu'='L','Lys'='K',
'Met'='M','Phe'='F','Pro'='P','Ser'='S','Thr'='T','Trp'='W',
'Tyr'='Y','Val'='V','Asx'='B','Glx'='Z')))
variant_plot<-combined_df5 %>%
ggplot(aes(x=POS,y=SAMPLE))+
geom_rect(aes(linetype= "ORF1ab"),xmin = 266,xmax = 21555,ymin = -Inf,ymax = Inf,fill="yellow2",alpha = 0.01)+
geom_rect(aes(linetype= "S"),xmin = 21563,xmax = 25384,ymin = -Inf,ymax =Inf,fill="powderblue",alpha = 0.01)+
geom_rect(aes(linetype= "ORF3a"),xmin = 25393,xmax = 26220,ymin = -Inf,ymax = Inf,fill="orange", alpha = 0.01)+
geom_rect(aes(linetype= "E"),xmin = 26245,xmax = 26472,ymin = -Inf,ymax = Inf,fill="lightslateblue",alpha = 0.01)+
geom_rect(aes(linetype= "M"),xmin = 26523,xmax = 27191,ymin = -Inf,ymax = Inf,fill="palevioletred1",alpha = 0.01)+
geom_rect(aes(linetype= "ORF6"),xmin = 27202,xmax = 27387,ymin = -Inf,ymax = Inf,fill="darkseagreen1",alpha = 0.01)+
geom_rect(aes(linetype= "ORF7a"),xmin = 27394,xmax = 27759,ymin = -Inf,ymax = Inf,fill="tan",alpha = 0.01)+
geom_rect(aes(linetype= "ORF7b"),xmin = 27756,xmax = 27887,ymin = -Inf,ymax = Inf,fill="lightsalmon",alpha = 0.01)+
geom_rect(aes(linetype= "ORF8"),xmin = 27894,xmax = 28259,ymin = -Inf,ymax = Inf,fill="orchid1",alpha = 0.01)+
geom_rect(aes(linetype= "N"),xmin = 28274,xmax = 29533,ymin = -Inf,ymax = Inf,fill="aquamarine",alpha = 0.01)+
geom_rect(aes(linetype= "ORF10"),xmin = 29558,xmax = 29674,ymin = -Inf,ymax = Inf,fill="olivedrab1",alpha = 0.01)+
scale_linetype_manual(name = "GENE",values= c("ORF1ab"=0,"S"=0,"ORF3a"=0,"E"=0,"M"=0,"ORF6"=0,"ORF7a"=0,"ORF7b"=0,"ORF8"=0,"N"=0,"ORF10"=0),
labels=c("ORF1ab","S","ORF3a","E","M","ORF6","ORF7a","ORF7b","ORF8","N","ORF10"),
guide=guide_legend(override.aes = list(fill = c("yellow2","powderblue","orange","lightslateblue","palevioletred1","darkseagreen1","tan","lightsalmon","orchid1","aquamarine","olivedrab1"),
alpha = .5)))+
scale_x_continuous(breaks = c(0,2500,5000,7500,10000,12500,15000,17500,20000,22500,25000,27500,30000), expand = c(0, 0))+
geom_jitter(position = position_jitter(width = 0.05, height = 0.05), alpha=1.5,aes(color=VARTYPE))+
labs(x="GENE",y="SAMPLE", title = "Variants ")+
theme(plot.title = element_text(hjust = 0.5, face = "bold"))+
scale_color_manual(values=c("SNP" = "gray40","DEL" = "orangered", "INS" ="blue"),name = "VARIANT")+
theme(axis.text.y = element_blank())+
theme(legend.position = "bottom")
variant_plot
S_gene<-combined_df5%>%
select(SAMPLE,POS,GENE_POS,VARTYPE,EFFECT,AA_Change)%>%
filter(GENE_POS=="S")
s_plot<-S_gene%>%
ggplot(aes(x=POS,y=SAMPLE,
text=paste(
"Sample:", SAMPLE,'\n',
"Gene:",GENE_POS,'\n',
"Position:",POS,'\n',
"Type:",VARTYPE,'\n',
"Effect:",EFFECT,'\n',
"AA Change:",AA_Change,'\n',
sep = ""
))) +
geom_jitter(aes(color=AA_Change))+
labs(x="Gene S",y="SAMPLE", title = "Variants across Gene S ")+
scale_x_continuous(breaks = c(21563,23473,25384))+
theme(plot.title = element_text(hjust = 0.5, face = "bold"))+
theme(axis.text.y = element_blank())
ggplotly(s_plot,tooltip = "text")
df_N<-read.csv("/Users/binitafebles/Desktop/Handley_Lab/Artic_renamed/full_SARS_COV2_MASTER_SPREADSHEET.csv")
df_N1<-df_N%>%
select(Submission_ID, Month.Collected,Pangolin_lineage, Nextclade_lineage,totalMissing,missing)%>%
rename(SAMPLE=Submission_ID)%>%
filter(Pangolin_lineage!="None")
#set all blank cells to NA
df_N1[df_N1==""] <-NA
#remove rows with NA
df_N1<-na.omit(df_N1)
#split col 'missing' with ',' and print in next line
df_N1<-df_N1 %>%
mutate(missing = strsplit(as.character(missing), ",")) %>%
unnest(missing)
#split col 'missing' into two diff columns 'mis_start' and 'mis_end' separating with delimeter '-'
df_N2<-separate(df_N1,col=missing,into = c("mis_start", "mis_end"),sep = "\\-")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 257 rows [9, 13,
## 16, 25, 29, 30, 31, 32, 34, 35, 37, 38, 39, 40, 41, 46, 50, 62, 65, 75, ...].
# remove NA from col mis_start and replace with value from col mis_end
df_N2$mis_start[is.na(df_N2$mis_start)]<-df_N2$mis_end[is.na(df_N2$mis_start)]
# remove NA from col mis_end and replace with value from col mis_start
df_N2$mis_end[is.na(df_N2$mis_end)]<-df_N2$mis_start[is.na(df_N2$mis_end)]
#col mis_start and mis_end are characters, so change it to numeric
df_N2$mis_start<-as.numeric(df_N2$mis_start) # convert character into numeric
## Warning: NAs introduced by coercion
df_N2$mis_end<-as.numeric(df_N2$mis_end)
#add new col 'Total' that calculates total length of missing N at specific position
df_N3<-df_N2 %>%
mutate(Total_mis = mis_end - mis_start)
N_plot<-df_N3%>%
ggplot(aes(x = mis_start, y = SAMPLE, color=Pangolin_lineage))+
geom_linerange(aes(xmin = mis_start, xmax = mis_end))+
scale_x_continuous(breaks = c(0,2500,5000,7500,10000,12500,15000,17500,20000,22500,25000,27500,30000), expand = c(0, 0))+
labs(x="Position",y="Sample", title = "Ns Distribution")+
theme(legend.text = element_text(size=12))+
theme(plot.title = element_text(hjust = 0.5, face = "bold"))+
theme(legend.position = "bottom")
N_plot
## Warning: Removed 2 rows containing missing values (geom_segment).
## Nucleotide Substitution Plot
NC_df<-combined_df5%>%
select(SAMPLE,REF,ALT,VARTYPE)%>%
filter(VARTYPE=="SNP")%>%
group_by(REF,ALT)%>%
summarise(total=n())%>%
mutate(Percentage=round(total/sum(total)*100,2))%>%
ungroup()
## `summarise()` has grouped output by 'REF'. You can override using the `.groups` argument.
NC_add<-data.frame(c("A","C","T","G"),
c("A","C","T","G"),
c(0,0,0,0),
c(0,0,0,0))
names(NC_add)<-c("REF","ALT","total","Percentage")
NC_final_df<-rbind(NC_df,NC_add)
NC_plot<-NC_final_df%>%
ggplot(aes(x=REF,y=ALT,fill=Percentage))+
geom_tile()+
geom_text(aes(label=Percentage),color="white")+
labs(x="Reference",y="Sample",title="Nucleotide Substitution") +
scale_fill_gradient()+
guides(color=FALSE)+
theme(axis.text = element_text(size = 12,face = "bold"))+
theme(plot.title = element_text(hjust = 0.5, face = "bold"))+
theme(panel.background=element_rect(fill = "transparent"))
NC_plot